#####################################
### This file prepares the data to be used in the analysis
#####################################


# set your working directory here
setwd("##") 

library(plyr)
library(foreign)
library(countrycode)
library(systemfit)
library(DataCombine)
library(ggplot2)
library(readstata13)
library(tidyverse)

logdelta = .01 # represents how much we add to the log events value

codelist_revised <- codelist_panel %>% # revise the codelist for countrycode panel data
  mutate(cown=ifelse(country.name.en=="Vietnam"&year>1975, 816, cown)) %>% #revise Vietnam given cown is missing after 1975
  mutate(cown=ifelse(country.name.en=="Germany"&year>1945&year<1955, 260, cown)) #revise Germany by counting 1946-54 as West Germany, not satisfying, but not matter here since we use data after 1994

## 1. set up data structure ---------------------------------------------------
## directed country year code
## from here https://sumtxt.wordpress.com/2015/07/29/dyad-year-dataset-with-tidyr-dplyr/
## ----------------------------------------------------------------------------

system <- read.csv(url("http://www.correlatesofwar.org/data-sets/state-system-membership/states2016/at_download/file"))
system <- system %>%  dplyr::select(ccode, styear, endyear) 
system <- system %>%  tidyr::expand(ccode1=ccode, ccode2=ccode, year=seq(1995,2013)) %>%
  filter(ccode1!=ccode2) %>% 
  left_join(., system, by=c("ccode1"="ccode")) %>%
  filter(year >= styear & year <= endyear) %>%
  dplyr::select(-styear,-endyear) %>% 
  left_join(., system, by=c("ccode2"="ccode")) %>%
  filter(year >= styear & year <= endyear) %>%
  dplyr::select(-styear,-endyear)

## 2. load covariates ---------------------------------------------------------
## ----------------------------------------------------------------------------

## dyadic trade data
final.dytrade <- read.dta("RawData/dytrade.dta", convert.underscore = TRUE) %>% 
  rename(year=Year) %>% filter(year > 1994 & year < 2014)

## load ICEWS DATA
load("RawData/icews2018.RData")

icewsYearSum_country <- icewsYearSum %>%
  group_by(ccode1, year) %>%
  summarise(coop.all=sum(coopTotal,na.rm=TRUE),
            host.all=sum(hostTotal,na.rm=TRUE)) %>% ungroup()

icews_trade <- system %>% left_join(subset(icewsYearSum, 
                                           select=c("ccode1","ccode2","year","coopTotal","hostTotal")),
                                    by=c("ccode1","ccode2","year")) %>%
  left_join(subset(icewsYearSum, 
                   select=c("ccode1","ccode2","year","coopTotal","hostTotal")),
            by=c("ccode1"="ccode2","ccode2"="ccode1","year"),
            suffix=c("",".reverse")) %>%
  left_join(icewsYearSum_country,by=c("ccode1","year")) %>%
  left_join(icewsYearSum_country,by=c("ccode2"="ccode1","year")) %>%
  mutate(coop.all.x=coop.all.x-coopTotal,
         coop.all.y=coop.all.y-coopTotal.reverse,
         host.all.x=host.all.x-hostTotal,
         host.all.y=host.all.y-hostTotal.reverse) %>% # each country's coop and host with all third parties
  left_join(final.dytrade, by=c("ccode1","ccode2","year"))


icews_trade$major.either <- ifelse(icews_trade$major.import == 1 | icews_trade$major.export == 1, 1, 0)
icews_trade$major.both <- ifelse(icews_trade$major.import == 1 & icews_trade$major.export == 1, 1, 0)

reverse <- icews_trade[,c("ccode1", "ccode2", "year", "major.either")] %>% 
  dplyr::rename(ccode1=ccode2, ccode2=ccode1, major.either.reverse=major.either)

icews_trade <- icews_trade %>% left_join(reverse, by=c("ccode1", "ccode2", "year")) %>%
  mutate(major.sym=as.numeric(major.either == 1 & major.either.reverse == 1),
         major.asym1=as.numeric(major.either == 1 & major.either.reverse == 0),
         major.asym2=as.numeric(major.either == 0 & major.either.reverse == 1))

## control variables
load("RawData/COWPolityControls2018.RData")
qog <- read.csv("RawData/qog_std_ts_jan17.csv")[,c("ccodecow", "year", "wdi_export", "wdi_import", "wdi_oilrent", "wdi_gdppppcon2011", "wdi_gdppppcur","wdi_gdpcappppcon2011", "wdi_pop", "wdi_popurb")]
qog_partner <- subset(qog, select = c(ccodecow, year, wdi_gdppppcon2011))
qog_partner <- qog_partner[complete.cases(qog_partner),]

icews_trade <- left_join(icews_trade, mindist, by=c("ccode1","ccode2","year")) %>%
  left_join(qog_partner, by=c("ccode2"="ccodecow","year")) %>%
  dplyr::rename(wdi_gdppppcon2011.ccode2=wdi_gdppppcon2011)

## export import nhhi
export.nhhi <- read.dta("RawData/all.export.data.dta", convert.underscore = TRUE) %>%
  left_join(subset(codelist_revised, select=c("iso3c","year","cown")), 
            by=c("ReporterISO3"="iso3c","year")) %>% dplyr::rename(ccode=cown) %>%
  filter(!is.na(ccode)) %>% dplyr::select(-ReporterISO3)

import.nhhi <- read.dta("RawData/all.import.data.dta", convert.underscore = TRUE) %>%
  left_join(subset(codelist_revised, select=c("iso3c","year","cown")), 
            by=c("ReporterISO3"="iso3c","year")) %>% dplyr::rename(ccode=cown) %>%
  filter(!is.na(ccode)) %>% dplyr::select(-ReporterISO3)

## Major power
major <- read.csv(url("http://www.correlatesofwar.org/data-sets/state-system-membership/majors2016/at_download/file"))
major <- major %>%  dplyr::select(ccode, styear, endyear) 
major <- major %>%  tidyr::expand(ccode1=ccode, year=seq(1816,2016)) %>% 
  left_join(., major, by=c("ccode1"="ccode")) %>% 
  filter(year >= styear & year <= endyear) %>%
  dplyr::select(-styear,-endyear) %>%
  mutate(ccode2=ccode1, major1=1, major2=1)

## Add globalization data
kof <- read.dta13("RawData/Data_2018.dta", convert.underscore = TRUE)[c(1:3, 7:8)] %>%
  left_join(subset(codelist_revised, select=c("wb", "year","cown")), 
            by=c("code"="wb", "year")) %>% dplyr::rename(ccode=cown) %>% 
  filter(!is.na(ccode)) %>% mutate(KOFEcGI = KOFEcGI/100, # rescale globalization
                                   KOFEcGIdf = KOFEcGIdf/100)

## change export import value into constant usd
convert <- read.dta13("RawData/convert_to_2005_dollars.dta")

## Add PTA data
## from DESTA: https://www.designoftradeagreements.org/downloads/
pta_end <- read.csv("RawData/list_of_withdrawals_dyads.txt") %>%
  mutate(endyear=year) %>%
  dplyr::select(c(iso1,iso2,endyear,base_treaty))

pta <- read.csv("RawData/list_of_treaties_01_05_dyads.txt") %>%
  mutate(beginyear=year) %>%
  dplyr::select(c(iso1,iso2,beginyear,base_treaty)) %>%
  left_join(pta_end,by=c("base_treaty","iso1","iso2")) %>%
  replace_na(list(endyear=2019)) %>%
  na.omit() #omit 1006 without begin year

pta_ddy <- pta %>%
  rowwise() %>%
  do(data.frame(base_treaty=.$base_treaty,
                iso1=.$iso1,
                iso2=.$iso2,
                year=seq(.$beginyear,.$endyear))) %>%
  ungroup()

pta_ddy <- pta_ddy %>%
  mutate(iso=iso2,
         iso2=iso1,
         iso1=iso) %>%
  dplyr::select(-iso) %>%
  bind_rows(pta_ddy) %>%
  mutate(pta=1)

pta_country <- pta_ddy %>% 
  group_by(iso1, year) %>%
  dplyr::summarise(pta=sum(pta)) %>%
  ungroup()  


pta_ddy_final <- pta_ddy %>%
  group_by(iso1, iso2, year) %>%
  summarise(dyadpta=sum(pta)) %>%
  ungroup() 

## generate iso code for icews_data
icews_trade <- icews_trade %>%
  mutate(iso1=countrycode(ccode1,'cown','iso3n'),
         iso2=countrycode(ccode2,'cown','iso3n')) %>%
  mutate(iso1=ifelse(ccode1==345, 688, iso1),
         iso2=ifelse(ccode2==345, 688, iso2), # given our time range, Yugoslavia to Serbia
         iso1=ifelse(ccode1==347, 900, iso1),
         iso2=ifelse(ccode2==347, 900, iso2)) # Kosovo

## 3. dyad level: merge everything -------------------------------------------
## ---------------------------------------------------------------------------

final.data.dyad <- icews_trade %>% left_join(export.nhhi, by=c("ccode1"="ccode","year")) %>% 
  left_join(export.nhhi, by=c("ccode2"="ccode","year")) %>% 
  left_join(import.nhhi, by=c("ccode1"="ccode","year")) %>% 
  left_join(import.nhhi, by=c("ccode2"="ccode","year")) %>%
  left_join(subset(major, select=c("ccode1","year","major1")), by=c("ccode1","year")) %>%
  left_join(subset(major, select=c("ccode2","year","major2")), by=c("ccode2","year")) %>%
  replace_na(list(major1=0, major2=0)) %>%
  left_join(polity[,c("ccode","year","polity2")], by=c("ccode1"="ccode", "year")) %>%
  left_join(polity[,c("ccode","year","polity2")], by=c("ccode2"="ccode", "year")) %>%
  left_join(cinc[,c("ccode","year","cinc")], by=c("ccode1"="ccode", "year")) %>%
  left_join(cinc[,c("ccode","year","cinc")], by=c("ccode2"="ccode", "year")) %>%
  left_join(qog, by=c("ccode1"="ccodecow", "year")) %>%
  left_join(qog, by=c("ccode2"="ccodecow", "year")) %>%
  left_join(allydyad, by=c("ccode1","ccode2","year")) %>%
  left_join(contiguity[,c("ccode1","ccode2","year","conttype")], by=c("ccode1","ccode2","year")) %>% 
  mutate(caprat=log(cinc.x) - log(cinc.y),
         dem1=polity2.x >= 7,
         dem2=polity2.y >= 7,
         aut1=polity2.x <= -7,
         aut2=polity2.y <= -7,
         alliance=(defense>0|neutrality>0|nonaggression>0),
         contiguity=conttype%in%c(1:5)) %>% 
  replace_na(list(dem1=0, dem2=0, aut1=0, aut2=0, coopTotal=0, hostTotal=0,
                  contiguity=0, alliance=0)) %>%
  mutate(weight.NHHI.both1 = (weight.NHHI.i.x*(imports.value.x/(imports.value.x + export.value.x))) + (weight.NHHI.e.x*(export.value.x/(imports.value.x + export.value.x))),
         weight.HHI.both1 = (weight.HHI.i.x*(imports.value.x/(imports.value.x + export.value.x))) + (weight.HHI.e.x*(export.value.x/(imports.value.x + export.value.x))),
         weight.NHHI.both2 = (weight.NHHI.i.y*(imports.value.y/(imports.value.y + export.value.y))) + (weight.NHHI.e.y*(export.value.y/(imports.value.y + export.value.y))),
         weight.HHI.both2 = (weight.HHI.i.y*(imports.value.y/(imports.value.y + export.value.y))) + (weight.HHI.e.y*(export.value.y/(imports.value.y + export.value.y)))
  ) %>% arrange(ccode1, ccode2, year) %>%
  mutate(dyad = ccode1*1000 + ccode2,
         open1 = (export.value.x + imports.value.x) / wdi_gdppppcur.x,
         open2 = (export.value.y + imports.value.y) / wdi_gdppppcur.y) %>%
  group_by(dyad) %>% 
  mutate(post.coopTotal=lead(coopTotal, 1, order_by=year),
         post.hostTotal=lead(hostTotal, 1, order_by=year)) %>% ungroup() %>% 
  left_join(kof, by=c("ccode1"="ccode", "year")) %>%
  left_join(kof, by=c("ccode2"="ccode", "year")) %>%
  left_join(convert, by=c("year")) %>%
  mutate(dep1 =100*((flow.atob + flow.btoa)*1000) / (wdi_gdppppcur.x),
         dep2 =100*((flow.atob + flow.btoa)*1000) / (wdi_gdppppcur.y),
         dyadic.balance = log(flow.atob + logdelta) - log(flow.btoa + logdelta),
         KOFEcGIdf.x.0 = KOFEcGIdf.x - mean(KOFEcGIdf.x, na.rm = TRUE),
         KOFEcGIdf.y.0 = KOFEcGIdf.y - mean(KOFEcGIdf.y, na.rm = TRUE)) %>% 
  left_join(pta_ddy_final,by=c("iso1","iso2","year")) %>%
  left_join(pta_country,by=c("iso1", "year")) %>%
  left_join(pta_country,by=c("iso2"="iso1","year"),suffix=c("1","2")) %>%
  filter(year < 2013) %>%
  group_by(ccode1, year) %>%
  mutate(KOF.count.maj.1=sum(major.either,na.rm=TRUE),
         KOF.count.maj.both.1=sum(major.both,na.rm=TRUE)) %>%
  ungroup() %>%
  group_by(ccode2, year) %>%
  mutate(KOF.count.maj.2=sum(major.either.reverse,na.rm=TRUE),
         KOF.count.maj.both.2=sum(major.both,na.rm=TRUE)) %>%
  ungroup()

## 4. country level merge ----------------------------------------------------
## ---------------------------------------------------------------------------

final.data <- icews_trade %>% group_by(ccode1,iso1, year) %>% # tally icews_trade by country year
  summarise(max.dist = max(mindist, na.rm = TRUE),
            imports.major = sum((major.import * flow.btoa), na.rm = TRUE),
            imports.all = sum(flow.btoa, na.rm = TRUE),
            exports.major = sum((major.export * flow.atob), na.rm = TRUE),
            exports.all = sum(flow.atob, na.rm = TRUE),
            count.partners = length(ccode1),
            count.major.exporters = sum(major.export, na.rm = TRUE),
            count.major.importers = sum(major.import, na.rm = TRUE),
            count.major.either = sum(major.either, na.rm = TRUE),
            count.major.both = sum(major.both, na.rm = TRUE),
            coop.major.import = sum(coopTotal*major.import, na.rm = TRUE),
            host.major.import = sum(hostTotal*major.import, na.rm = TRUE),
            coop.minor.import = sum(coopTotal*(1-major.import), na.rm = TRUE),
            host.minor.import = sum(hostTotal*(1-major.import), na.rm = TRUE),
            coop.major.export = sum(coopTotal*major.export, na.rm = TRUE),
            host.major.export = sum(hostTotal*major.export, na.rm = TRUE),
            coop.minor.export = sum(coopTotal*(1-major.export), na.rm = TRUE),
            host.minor.export = sum(hostTotal*(1-major.export), na.rm = TRUE),
            coop.major.either = sum(coopTotal*major.either, na.rm = TRUE),
            host.major.either = sum(hostTotal*major.either, na.rm = TRUE),
            coop.minor.either = sum(coopTotal*(1-major.both), na.rm = TRUE),
            host.minor.either = sum(hostTotal*(1-major.both), na.rm = TRUE),   
            coop.major.both = sum(coopTotal*major.both, na.rm = TRUE),
            host.major.both = sum(hostTotal*major.both, na.rm = TRUE),
            coop.minor.both = sum(coopTotal*(1-major.either), na.rm = TRUE),
            host.minor.both = sum(hostTotal*(1-major.either), na.rm = TRUE),
            dist.major.import = sum(mindist*major.import,na.rm = TRUE)/(sum(major.import, na.rm = TRUE)+1),
            dist.major.export = sum(mindist*major.export,na.rm = TRUE)/(sum(major.export, na.rm = TRUE)+1),
            dist.major.either = sum(mindist*major.either, na.rm = TRUE) / (sum(major.either, na.rm = TRUE)+1),
            dist.major.both = sum(mindist*major.both,na.rm = TRUE)/(sum(major.both, na.rm = TRUE)+1),
            dist.minor.import = sum(mindist*(1-major.import),na.rm = TRUE)/(sum((1-major.import), na.rm = TRUE)+1),
            dist.minor.export = sum(mindist*(1-major.export),na.rm = TRUE)/(sum((1-major.export), na.rm = TRUE)+1),
            dist.minor.either = sum(mindist*(1-major.both),na.rm = TRUE)/(sum((1-major.both), na.rm = TRUE)+1),
            dist.minor.both = sum(mindist*(1-major.either),na.rm = TRUE) / (sum((1-major.either), na.rm = TRUE)+1),
            gdp.major.either = sum(wdi_gdppppcon2011.ccode2*major.either, na.rm = TRUE) / (sum(major.either, na.rm = TRUE) + 1),
            gdp.minor.both = sum(wdi_gdppppcon2011.ccode2 * (1 - major.either), na.rm = TRUE) / (sum((1-major.either), na.rm = TRUE)+1),
            wght.gdp.dist.ib = sum(wdi_gdppppcon2011.ccode2 * (1 - major.either) * (1 - (mindist / max.dist)), na.rm = TRUE),
            wght.gdp.dist.ae = sum(wdi_gdppppcon2011.ccode2 * major.either * (1 - (mindist / max.dist)), na.rm = TRUE)
  ) %>% ungroup() %>%
  left_join(export.nhhi, by = c("ccode1"="ccode", "year")) %>%
  left_join(import.nhhi, by = c("ccode1"="ccode", "year")) %>%
  left_join(subset(major, select=c("ccode1","year","major1")), by=c("ccode1", "year")) %>%
  left_join(polity[,c("ccode","year","polity2")], by = c("ccode1"="ccode", "year")) %>%
  left_join(cinc[,c("ccode","year","cinc")], by = c("ccode1"="ccode", "year")) %>%
  left_join(qog, by = c("ccode1"="ccodecow", "year")) %>%
  left_join(allystate,by=c("ccode1","year")) %>%
  mutate(conf.major.wght.either = host.major.either / count.major.either,
         conf.minor.wght.both = host.minor.both / (count.partners - count.major.either),
         coop.major.wght.either = coop.major.either / count.major.either,
         coop.minor.wght.both = coop.minor.both / (count.partners - count.major.either)
  ) %>% arrange(ccode1, year) %>% group_by(ccode1) %>%
  mutate(post.conf.major.wght.either=lead(conf.major.wght.either, 1),
         post.conf.minor.wght.both=lead(conf.minor.wght.both, 1),
         post.coop.major.wght.either=lead(coop.major.wght.either, 1),
         post.coop.minor.wght.both=lead(coop.minor.wght.both, 1),
         post.host.major.either=lead(host.major.either, 1),
         post.host.minor.both=lead(host.minor.both, 1),
         post.coop.major.either=lead(coop.major.either, 1),
         post.coop.minor.both=lead(coop.minor.both, 1)) %>% ungroup() %>% 
  left_join(convert, by="year") %>%
  mutate(export.const = exports.all/divide_current_value_by_this,
         import.const = imports.all/divide_current_value_by_this,
         all.trade.con = (imports.all + exports.all) / divide_current_value_by_this,
         trade.gdp = 1000*((imports.all + exports.all) ) / (wdi_gdppppcur),
         major.gdp = (1000*(imports.major + exports.major) ) / (wdi_gdppppcur),
         minor.gdp = (1000*(imports.all + exports.all - imports.major - exports.major)) / (wdi_gdppppcur),
         log.major.con = log(imports.major + exports.major+1) / divide_current_value_by_this, # add 1 here instead of logdelta to avoid negative values 
         log.minor.con = log(imports.all + exports.all - imports.major - exports.major+1) / divide_current_value_by_this, # add 1 here instead of logdelta to avoid negative values
         weight.NHHI.both = (weight.NHHI.i * (import.const/(import.const + export.const))) + (weight.NHHI.e * (export.const/(import.const + export.const))),
         weight.HHI.both = (weight.HHI.i * (import.const/(import.const + export.const))) + (weight.HHI.e * (export.const/(import.const + export.const))),
         log.all.trade.con = log(all.trade.con+1), # add 1 here instead of logdelta to avoid negative values
         count.minor.both = count.partners - count.major.either,
         trade.balance = log(exports.all+logdelta) - log(imports.all+logdelta),
         dem = ifelse(polity2 >= 7, 1, 0),
         aut = ifelse(polity2 <= -7, 1, 0),
         pol.missing = ifelse(is.na(polity2), 1, 0)) %>%
  replace_na(list(dem=0, aut=0)) %>%
  left_join(kof, by=c("ccode1"="ccode", "year")) %>%
  mutate(KOFEcGIdf.0 = KOFEcGIdf - mean(KOFEcGIdf, na.rm = TRUE)) %>%
  left_join(pta_country,by=c("iso1", "year")) %>%
  filter(year < 2013)

